package net.nutch.util;
import java.net.URL;
import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.apache.html.dom.*;
/**
* Class for parsing META Directives from DOM trees. This class
* currently handles Robots META directives (all, none, nofollow,
* noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
* instructions.
*/
public class RobotsMetaProcessor {
/**
* Utility class with indicators for the robots directives "noindex"
* and "nofollow", and HTTP-EQUIV/no-cache
*/
public static class RobotsMetaIndicator {
private boolean noIndex= false;
private boolean noFollow= false;
private boolean noCache= false;
private URL baseHref= null;
/**
* Sets <code>noIndex</code>, <code>noFollow</code> and
* <code>noCache</code> to <code>false</code>.
*/
public void reset() {
noIndex= false;
noFollow= false;
noCache= false;
baseHref= null;
}
/**
* Sets <code>noFollow</code> to <code>true</code>.
*/
public void setNoFollow() {
noFollow= true;
}
/**
* Sets <code>noIndex</code> to <code>true</code>.
*/
public void setNoIndex() {
noIndex= true;
}
/**
* Sets <code>noCache</code> to <code>true</code>.
*/
public void setNoCache() {
noCache= true;
}
/**
* Sets the <code>baseHref</code>.
*/
public void setBaseHref(URL baseHref) {
this.baseHref= baseHref;
}
/**
* Returns the current value of <code>noIndex</code>.
*/
public boolean getNoIndex() {
return noIndex;
}
/**
* Returns the current value of <code>noFollow</code>.
*/
public boolean getNoFollow() {
return noFollow;
}
/**
* Returns the current value of <code>noCache</code>.
*/
public boolean getNoCache() {
return noCache;
}
/**
* Returns the <code>baseHref</code>, if set, or <code>null</code>
* otherwise.
*/
public URL getBaseHref() {
return baseHref;
}
}
/**
* Sets the indicators in <code>robotsMeta</code> to appropriate
* values, based on any META tags found under the given
* <code>node</code>.
*/
public static final void getRobotsMetaDirectives(
RobotsMetaIndicator robotsMeta, Node node, URL currURL) {
robotsMeta.reset();
getRobotsMetaDirectivesHelper(robotsMeta, node, currURL);
}
private static final void getRobotsMetaDirectivesHelper(
RobotsMetaIndicator robotsMeta, Node node, URL currURL) {
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("BODY".equals(node.getNodeName())) {
// META tags should not be under body
return;
}
if ("META".equals(node.getNodeName())) {
NamedNodeMap attrs= node.getAttributes();
Node nameNode= attrs.getNamedItem("name");
if (nameNode != null) {
if ("robots".equalsIgnoreCase(nameNode.getNodeValue())) {
Node contentNode= attrs.getNamedItem("content");
if (contentNode != null) {
String directives=
contentNode.getNodeValue().toLowerCase();
int index= directives.indexOf("none");
if (index >= 0) {
robotsMeta.setNoIndex();
robotsMeta.setNoFollow();
}
index= directives.indexOf("all");
if (index >= 0) {
// do nothing...
}
index= directives.indexOf("noindex");
if (index >= 0) {
robotsMeta.setNoIndex();
}
index= directives.indexOf("nofollow");
if (index >= 0) {
robotsMeta.setNoFollow();
}
}
} // end if (name == robots)
} // end if (nameNode != null)
Node HTTPEquivNode= attrs.getNamedItem("http-equiv");
if ( (HTTPEquivNode != null)
&& ("Pragma".equalsIgnoreCase(HTTPEquivNode.getNodeValue())) ) {
Node contentNode= attrs.getNamedItem("content");
if (contentNode != null) {
String content= contentNode.getNodeValue().toLowerCase();
int index= content.indexOf("no-cache");
if (index >= 0)
robotsMeta.setNoCache();
}
}
} else if ("BASE".equalsIgnoreCase(node.getNodeName())) {
NamedNodeMap attrs= node.getAttributes();
Node hrefNode= attrs.getNamedItem("href");
if (hrefNode != null) {
String urlString= hrefNode.getNodeValue();
URL url= null;
try {
if (currURL == null)
url= new URL(urlString);
else
url= new URL(currURL, urlString);
} catch (Exception e) {
;
}
if (url != null)
robotsMeta.setBaseHref(url);
}
}
}
NodeList children = node.getChildNodes();
if ( children != null ) {
int len = children.getLength();
for ( int i = 0; i < len; i++ ) {
getRobotsMetaDirectivesHelper(robotsMeta, children.item(i), currURL);
}
}
}
}